from datasets import load_dataset
import os.path as osp
import json
import tqdm
import os
import zipfile


# Question ids that make up the reduced VSI-Bench subset written to
# ``data_tiny.json``. They are compared against each annotation's ``id`` field.
# The indexes follow https://github.com/vision-x-nyu/thinking-in-space/blob/9d2ab309f875018a6898ad847d70e255bb184e79/lmms_eval/tasks/vsibench/utils.py#L79
_VSI_BENCH_TINY_IDS = frozenset(
    {
        3, 9, 22, 27, 37, 41, 42, 80, 81, 91,
        106, 125, 141, 145, 151, 153, 154, 166, 182, 188,
        202, 206, 219, 285, 289, 379, 380, 423, 427, 435,
        443, 446, 462, 463, 485, 512, 515, 521, 530, 534,
        536, 552, 556, 562, 565, 586, 594, 602, 608, 609,
        610, 613, 616, 617, 619, 626, 654, 661, 662, 666,
        672, 673, 675, 679, 686, 696, 699, 717, 825, 839,
        842, 869, 874, 879, 913, 927, 935, 968, 987, 1023,
        1036, 1049, 1059, 1125, 1149, 1164, 1199, 1213, 1215, 1235,
        1271, 1292, 1321, 1338, 1358, 1359, 1373, 1376, 1378, 1381,
        1395, 1405, 1417, 1447, 1462, 1463, 1471, 1472, 1479, 1485,
        1500, 1519, 1528, 1531, 1554, 1560, 1570, 1571, 1575, 1582,
        1584, 1601, 1645, 1670, 1682, 1689, 1690, 1713, 1741, 1762,
        1795, 1838, 1855, 1933, 2019, 2023, 2051, 2060, 2073, 2093,
        2101, 2107, 2136, 2140, 2145, 2151, 2154, 2224, 2231, 2233,
        2242, 2295, 2309, 2333, 2336, 2357, 2370, 2372, 2373, 2392,
        2394, 2413, 2424, 2440, 2478, 2488, 2491, 2496, 2497, 2557,
        2559, 2568, 2574, 2586, 2615, 2635, 2652, 2653, 2663, 2672,
        2678, 2681, 2682, 2687, 2689, 2693, 2698, 2709, 2715, 2747,
        2752, 2757, 2771, 2786, 2794, 2795, 2797, 2819, 2851, 2869,
        2871, 2876, 2891, 2894, 2897, 2910, 2919, 2969, 2975, 2978,
        2985, 3016, 3027, 3046, 3070, 3072, 3075, 3089, 3102, 3133,
        3143, 3150, 3179, 3180, 3182, 3306, 3307, 3312, 3313, 3329,
        3348, 3367, 3390, 3401, 3404, 3432, 3464, 3532, 3561, 3585,
        3586, 3657, 3666, 3697, 3711, 3712, 3716, 3728, 3730, 3767,
        3778, 3780, 3781, 3783, 3787, 3806, 3814, 3819, 3832, 3833,
        3836, 3839, 3842, 3859, 3874, 3880, 3882, 3886, 3938, 3950,
        3956, 3966, 3967, 3975, 4011, 4052, 4074, 4077, 4082, 4084,
        4102, 4139, 4144, 4145, 4147, 4149, 4171, 4180, 4183, 4184,
        4188, 4198, 4214, 4237, 4240, 4243, 4298, 4316, 4330, 4359,
        4362, 4364, 4376, 4381, 4387, 4403, 4405, 4424, 4430, 4438,
        4453, 4457, 4458, 4476, 4507, 4508, 4512, 4547, 4551, 4558,
        4565, 4583, 4585, 4630, 4645, 4661, 4663, 4689, 4697, 4698,
        4707, 4750, 4764, 4768, 4773, 4798, 4818, 4838, 4840, 4844,
        4870, 4874, 4890, 4900, 4922, 4923, 4925, 4932, 4939, 4945,
        4963, 4964, 4965, 4966, 4970, 4972, 4974, 4975, 4978, 4983,
        4989, 4990, 4991, 4993, 5002, 5005, 5007, 5008, 5024, 5026,
        5038, 5040, 5044, 5045, 5048, 5049, 5050, 5051, 5055, 5058,
        5063, 5064, 5071, 5075, 5090, 5103, 5112, 5113, 5114, 5117,
        5120, 5121, 5124, 5131, 5134, 5135, 5136, 5139, 5146, 5147,
    }
)

# Base names of the video archives looked up (as ``<name>.zip``) in the
# download directory once the annotations have been written.
_VSI_BENCH_ARCHIVES = ("arkitscenes", "scannetpp", "scannet")


def process(cfg):
    """Download VSI-Bench and convert its annotations to JSON files.

    Steps, in order:

    1. Download the dataset repository ``cfg.dataset_path`` with
       ``huggingface-cli`` into ``<processed_dataset_path>/<dataset_name>``.
    2. Load the requested split from that directory with ``load_dataset``.
    3. Write every annotation to ``<split>/data.json`` and the annotations
       whose ``id`` is in ``_VSI_BENCH_TINY_IDS`` to ``<split>/data_tiny.json``.
    4. Extract any of the known video archives found in the download
       directory into the split directory.

    Args:
        cfg: Configuration object exposing ``dataset_path``, ``split`` and
            ``processed_dataset_path`` as attributes and a dict-style
            ``get`` used to read the optional ``dataset_name``.
            NOTE(review): presumably an OmegaConf/Hydra config -- confirm
            against the caller.

    Returns:
        None. All results are written to disk.
    """
    # Imported locally so the block does not depend on a file-level import.
    import subprocess

    data_dir, split = cfg.dataset_path, cfg.split
    name = cfg.get("dataset_name", "")
    output_root = osp.join(cfg.processed_dataset_path, name)
    output_dir = osp.join(output_root, split)
    os.makedirs(output_dir, exist_ok=True)

    # download VSI-Bench
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and avoids command injection through the config.
    download_cmd = [
        "huggingface-cli",
        "download",
        "--repo-type",
        "dataset",
        "--resume-download",
        str(data_dir),
        "--local-dir",
        str(output_root),
    ]
    # The download stays best-effort (files from a previous run may already be
    # present), but a failure is reported instead of being silently ignored.
    try:
        completed = subprocess.run(download_cmd, check=False)
    except OSError as err:
        print(f"Warning: could not run huggingface-cli ({err}).")
    else:
        if completed.returncode != 0:
            print(
                "Warning: huggingface-cli download exited with status "
                f"{completed.returncode}."
            )

    dataset = load_dataset(output_root, split=split)

    content = []
    content_mini = []
    for annotation in tqdm.tqdm(dataset):
        info = {
            "question_id": annotation["id"],
            "question": annotation["question"],
            "answer": annotation["ground_truth"],
            "question_type": annotation["question_type"],
            "dataset": annotation["dataset"],
            "scene_name": annotation["scene_name"],
        }
        # When options are present, append them to the question text.
        if annotation["options"] is not None:
            info["question"] += "\nOptions:\n" + "\n".join(annotation["options"])
        # Relative path, built from the source dataset name and scene name.
        info["video_path"] = f"{annotation['dataset']}/{annotation['scene_name']}.mp4"
        content.append(info)
        if annotation["id"] in _VSI_BENCH_TINY_IDS:
            content_mini.append(info)

    with open(osp.join(output_dir, "data.json"), "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)
    with open(osp.join(output_dir, "data_tiny.json"), "w", encoding="utf-8") as f:
        json.dump(content_mini, f, indent=2)

    # unzip the datasets
    missing_archives = []
    for archive_name in _VSI_BENCH_ARCHIVES:
        zip_path = osp.join(output_root, f"{archive_name}.zip")
        if not osp.exists(zip_path):
            missing_archives.append(f"{archive_name}.zip")
            continue
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(output_dir)

    # Only ask for manual work when an archive could not be extracted here.
    if missing_archives:
        print(
            "\033[91m"
            + f"Could not find {', '.join(missing_archives)} in {output_root}. "
            + f"Please download and unzip them in {output_dir} manually."
            + "\033[0m"
        )
